In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
Objective (from Kaggle site):
This problem was discussed in AWS Machine Learning resources: https://blogs.aws.amazon.com/bigdata/post/Tx2OZ63RJ6Z41A0/Building-a-Numeric-Regression-Model-with-Amazon-Machine-Learning
Strategy:
We will create two separate Linear Regression models:
1. Casual Rental Prediction
2. Registered Rental Prediction
We will then combine the two predictions to get the total count for submission to Kaggle. The competition is already over, but we can still submit to check the accuracy of the model.
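Download the train.csv, test.csv and samplesubmission.csv files from the Kaggle competition page and store them in C:\AWSMLCourse\Data\RegressionExamples\BikeTrain (the notebook reads them through the relative path ..\Data\RegressionExamples\BikeTrain).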
This notebook reads the train.csv file and creates two separate training files: one for registered user rentals and another for casual user rentals.
The following additional features are added: month, day, hour, dayofweek.
The test.csv file is read and the same additional features are created.
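The following files are generated: bike_rental_train_casual.csv, bike_rental_train_registered.csv, bike_rental_all.csv, bike_rental_test.csv and, at the end of the notebook, predicted_test_kaggle.csv (the Kaggle submission file).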
The following schema files are provided for creating the datasources. Copy these schema files to the S3 bucket/folder where you keep the CSV training files.
In [2]:
# Directory holding the Kaggle CSV files, relative to this notebook.
# Built with os.path.join so the separator matches the host OS; a literal
# backslash path is a single (non-existent) name on Linux/macOS.
data_path = os.path.join('..', 'Data', 'RegressionExamples', 'BikeTrain')
In [3]:
# Load the training set; 'datetime' is parsed into timestamps so the
# .dt accessor can be used on it when deriving calendar features.
df = pd.read_csv(
    filepath_or_buffer=os.path.join(data_path, 'train.csv'),
    parse_dates=['datetime'],
)
In [4]:
# Preview the first five rows of the freshly loaded training data.
df.head(n=5)
Out[4]:
In [5]:
# Derive calendar features from the timestamp so the data can be explored
# (and later modelled) by month, day, hour and weekday.
df = df.assign(
    month=df['datetime'].dt.month,
    day=df['datetime'].dt.day,
    hour=df['datetime'].dt.hour,
    dayofweek=df['datetime'].dt.dayofweek,  # Monday=0 ... Sunday=6
)
In [6]:
# Total casual rentals per weekday (0=Monday ... 6=Sunday).
df.groupby('dayofweek')['casual'].sum()
Out[6]:
In [7]:
# Total registered rentals per weekday (0=Monday ... 6=Sunday).
df.groupby('dayofweek')['registered'].sum()
Out[7]:
In [8]:
# Summary statistics of the casual rental target.
df['casual'].describe()
Out[8]:
In [9]:
# Summary statistics of the registered rental target.
df['registered'].describe()
Out[9]:
In [10]:
# Pairwise correlation of the numeric columns.
# The frame also holds the parsed 'datetime' column; pandas >= 2.0 no longer
# drops non-numeric columns from corr() by default, so select the numeric
# ones explicitly (this form also works on older pandas).
df.select_dtypes(include='number').corr()
Out[10]:
In [11]:
# Build a reproducible random ordering of the row positions.
# The fixed seed makes the shuffle identical on every run; the shuffled
# positions are applied to the frame in the next cell and the result is
# what gets written out as training data.
np.random.seed(5)
l = list(range(len(df)))
np.random.shuffle(l)
In [12]:
# Reorder the rows using the shuffled positions (all columns kept).
df = df.iloc[l, :]
In [13]:
# Preview the first five rows after shuffling.
df.head(n=5)
Out[13]:
In [14]:
# Write the training files. Two separate models are trained, so each one gets
# its own file containing the shared feature columns plus a single target:
#   Model 1: casual (non-registered) users rental count by hour
#   Model 2: registered users rental count by hour
# A third file with both targets and the total count is saved for reference.

# Feature columns shared by every output file; defined once so the three
# files cannot drift apart.
feature_columns = ['datetime', 'month', 'day', 'hour', 'dayofweek', 'season', 'holiday',
                   'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed']

# (file name, target columns appended after the features), in write order.
training_outputs = [
    ('bike_rental_train_casual.csv', ['casual']),
    ('bike_rental_train_registered.csv', ['registered']),
    ('bike_rental_all.csv', ['casual', 'registered', 'count']),
]

for file_name, target_columns in training_outputs:
    df.to_csv(os.path.join(data_path, file_name),
              index=False,
              columns=feature_columns + target_columns)
In [15]:
# Load the test set and derive the same calendar features that were added
# to the training data, so both files expose the same feature columns.
df_test = pd.read_csv(
    filepath_or_buffer=os.path.join(data_path, 'test.csv'),
    parse_dates=['datetime'],
).assign(
    month=lambda frame: frame['datetime'].dt.month,
    day=lambda frame: frame['datetime'].dt.day,
    hour=lambda frame: frame['datetime'].dt.hour,
    dayofweek=lambda frame: frame['datetime'].dt.dayofweek,  # Monday=0 ... Sunday=6
)
In [16]:
# Persist the feature-enriched test set (feature columns only, no row index).
df_test.to_csv(
    path_or_buf=os.path.join(data_path, 'bike_rental_test.csv'),
    columns=[
        'datetime', 'month', 'day', 'hour', 'dayofweek', 'season', 'holiday',
        'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    ],
    index=False,
)
In [17]:
# (rows, columns) of the prepared test set, including the derived features.
df_test.shape
Out[17]:
In [18]:
# How many training rows fall on each 'holiday' value.
df['holiday'].value_counts()
Out[18]:
In [19]:
# How many training rows fall on each 'workingday' value.
df['workingday'].value_counts()
Out[19]:
In [20]:
# How many training rows fall in each 'weather' category.
df['weather'].value_counts()
Out[20]:
In [21]:
# Load the predicted values for casual users. The registered-user
# predictions are read further down; the two are then summed to build
# the submission.
df_casual_predicted = pd.read_csv(
    filepath_or_buffer=os.path.join(
        data_path,
        'output_casual',
        'bp-jLEzJYrQqDj-bike_rental_test.csv.gz',
    ),
)
In [22]:
# Preview the first five casual predictions.
df_casual_predicted.head(n=5)
Out[22]:
In [23]:
def adjust_score(x):
    """Floor a predicted value at zero.

    Returns 0 when ``x`` is below zero; otherwise returns ``x`` unchanged.
    """
    return 0 if x < 0 else x
In [24]:
# A rental count cannot be negative, so floor the predicted scores at zero.
# Series.clip does this in one vectorized call instead of invoking the Python
# helper adjust_score once per row; the resulting values are the same.
df_casual_predicted['casual_rental'] = df_casual_predicted['score'].clip(lower=0)
In [25]:
# Inspect the last five rows, now including the adjusted 'casual_rental'.
df_casual_predicted.tail(n=5)
Out[25]:
In [26]:
# Distribution of the adjusted casual predictions, in 20 bins.
df_casual_predicted['casual_rental'].hist(bins=20)
Out[26]:
In [27]:
# Load the predicted values for registered users.
df_registered_predicted = pd.read_csv(
    filepath_or_buffer=os.path.join(
        data_path,
        'output_registered',
        'bp-CnT4idlhEIa-bike_rental_test.csv.gz',
    ),
)
In [28]:
# A rental count cannot be negative, so floor the predicted scores at zero.
# Series.clip does this in one vectorized call instead of invoking the Python
# helper adjust_score once per row; the resulting values are the same.
df_registered_predicted['registered_rental'] = df_registered_predicted['score'].clip(lower=0)
In [29]:
# Preview the first five registered predictions with the adjusted column.
df_registered_predicted.head(n=5)
Out[29]:
In [30]:
# Summary statistics of the adjusted registered predictions.
df_registered_predicted['registered_rental'].describe()
Out[30]:
In [31]:
# Distribution of the adjusted registered predictions, in 20 bins.
df_registered_predicted['registered_rental'].hist(bins=20)
Out[31]:
In [32]:
# Bring the casual predictions alongside the registered ones.
# Column assignment aligns rows on the index, so a row-count mismatch between
# the two prediction files would silently produce NaN values or drop rows;
# fail loudly instead.
assert len(df_registered_predicted) == len(df_casual_predicted), \
    'casual and registered prediction files have different row counts'
df_registered_predicted['casual_rental'] = df_casual_predicted['casual_rental']
In [33]:
# Total predicted rentals = registered + casual.
df_registered_predicted['count'] = (
    df_registered_predicted['registered_rental']
    + df_registered_predicted['casual_rental']
)
In [34]:
# Inspect the last five rows of the combined predictions.
df_registered_predicted.tail(n=5)
Out[34]:
In [35]:
# Assign the final column labels by position. The frame must have exactly
# five columns at this point, otherwise pandas raises a length mismatch.
df_registered_predicted.columns = [
    'datetime', 'score', 'registered_rental', 'casual_rental', 'count',
]
In [36]:
# Confirm the new column labels on the last five rows.
df_registered_predicted.tail(n=5)
Out[36]:
In [37]:
# Write the Kaggle submission file: only the timestamp and the total count,
# without the row index.
df_registered_predicted.to_csv(
    path_or_buf=os.path.join(data_path, 'predicted_test_kaggle.csv'),
    columns=['datetime', 'count'],
    index=False,
)